##############
# This script performs the lasso-selection of covariates that are correlated with the contact button click, gender, and workvolume
# Author: Daniel Kopp
##############

library(data.table)
library(Matrix)
library(foreach)
library(glmnet)
library(glmnetUtils)
library(readr)
library(foreign)
library(tidyverse)
library(MASS)
library(biglasso)
library(bigmemory)
library(bigmemory.sri)
library(ncvreg)
library(lfe)
library(fastDummies)

rm(list = ls())
gc()  # garbage collector

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

# Set seed
set.seed(1234)

# Load data 
with_interactions = haven::read_dta("data_processed/lasso_20p_rand_sample_parttime.dta")

# We drop observations with missing values
with_interactions <- with_interactions[complete.cases(with_interactions),]
gc()  # garbage collector

# Transform work volume variable (6 categories) into variable with 3 categories
with_interactions <- with_interactions %>% rowwise() %>% mutate(workvolume_cat0 = workvolume_det_cat0)
with_interactions <- with_interactions %>% rowwise() %>% mutate(workvolume_cat1 = max(c(workvolume_det_cat1, workvolume_det_cat2)))
with_interactions <- with_interactions %>% rowwise() %>% mutate(workvolume_cat2 = max(c(workvolume_det_cat3, workvolume_det_cat4)))
with_interactions <- with_interactions %>% rowwise() %>% mutate(workvolume_cat3 = max(c(workvolume_det_cat5, workvolume_det_cat6)))

with_interactions <- with_interactions %>% dplyr::select(-workvolume_det_cat0, -workvolume_det_cat1, -workvolume_det_cat2, -workvolume_det_cat3, -workvolume_det_cat4, -workvolume_det_cat5, -workvolume_det_cat6)

fixed_effects <- with_interactions %>% dplyr::select(search_tag, kanton, rank)

#######################
# Partialling out of search fixed effects, rank fixed effects and canton fixed effects
# To reduce memory usage, we split the data into several badges
#######################

vars_partialling_out1 <- with_interactions[,1:500] %>% dplyr::select(-search_tag, -tracking_id, -click_candidate, -stes_id, -kanton, -rank, -kanton1, -kanton2, -kanton3, -kanton4, -kanton5, -kanton6, -kanton7, -kanton8, -kanton9, -kanton10, -kanton11, -kanton12, -kanton13, -kanton14, -kanton15, -kanton16, -kanton17, -kanton18, -kanton19, -kanton20, -kanton21, -kanton22, -kanton23, -kanton24, -kanton25, -kanton26) %>% as.matrix()
dim(vars_partialling_out1)

vars_partialling_out1 <- residuals(felm(vars_partialling_out1 ~ -1  | search_tag + kanton + rank, data=fixed_effects,nostats=TRUE))
colnames(vars_partialling_out1) <- str_replace_all(colnames(vars_partialling_out1),"vars_partialling_out1.","")
with_interactions <- with_interactions[,-(1:500)]
gc()  # garbage collector

vars_partialling_out2 <- with_interactions[,1:500]  %>% as.matrix()
dim(vars_partialling_out2)
vars_partialling_out2 <- residuals(felm(vars_partialling_out2 ~ -1  | search_tag + kanton + rank, data=fixed_effects,nostats=TRUE))
colnames(vars_partialling_out2) <- str_replace_all(colnames(vars_partialling_out2),"vars_partialling_out2.","")
with_interactions <- with_interactions[,-(1:500)]
gc()  # garbage collector

vars_partialling_out3 <- with_interactions[,1:500]  %>% as.matrix()
dim(vars_partialling_out3)
vars_partialling_out3 <- residuals(felm(vars_partialling_out3 ~ -1  | search_tag + kanton + rank, data=fixed_effects,nostats=TRUE))
colnames(vars_partialling_out3) <- str_replace_all(colnames(vars_partialling_out3),"vars_partialling_out3.","")
with_interactions <- with_interactions[,-(1:500)]
gc()  # garbage collector

vars_partialling_out4 <- with_interactions[,1:500]  %>% as.matrix()
dim(vars_partialling_out4)
vars_partialling_out4 <- residuals(felm(vars_partialling_out4 ~ -1  | search_tag + kanton + rank, data=fixed_effects,nostats=TRUE))
colnames(vars_partialling_out4) <- str_replace_all(colnames(vars_partialling_out4),"vars_partialling_out4.","")
with_interactions <- with_interactions[,-(1:500)]
gc()  # garbage collector

vars_partialling_out5 <- with_interactions[,1:500]  %>% as.matrix()
dim(vars_partialling_out5)
vars_partialling_out5 <- residuals(felm(vars_partialling_out5 ~ -1  | search_tag + kanton + rank, data=fixed_effects,nostats=TRUE))
colnames(vars_partialling_out5) <- str_replace_all(colnames(vars_partialling_out5),"vars_partialling_out5.","")
with_interactions <- with_interactions[,-(1:500)]
gc()  # garbage collector

vars_partialling_out6 <- with_interactions[,1:500]  %>% as.matrix()
dim(vars_partialling_out6)
vars_partialling_out6 <- residuals(felm(vars_partialling_out6 ~ -1  | search_tag + kanton + rank, data=fixed_effects,nostats=TRUE))
colnames(vars_partialling_out6) <- str_replace_all(colnames(vars_partialling_out6),"vars_partialling_out6.","")
with_interactions <- with_interactions[,-(1:500)]
gc()  # garbage collector

vars_partialling_out7 <- with_interactions[,1:500]  %>% as.matrix()
dim(vars_partialling_out7)
vars_partialling_out7 <- residuals(felm(vars_partialling_out7 ~ -1  | search_tag + kanton + rank, data=fixed_effects,nostats=TRUE))
colnames(vars_partialling_out7) <- str_replace_all(colnames(vars_partialling_out7),"vars_partialling_out7.","")
with_interactions <- with_interactions[,-(1:500)]
gc()  # garbage collector

vars_partialling_out8 <- with_interactions[,1:500]  %>% as.matrix()
dim(vars_partialling_out8)
vars_partialling_out8 <- residuals(felm(vars_partialling_out8 ~ -1  | search_tag + kanton + rank, data=fixed_effects,nostats=TRUE))
colnames(vars_partialling_out8) <- str_replace_all(colnames(vars_partialling_out8),"vars_partialling_out8.","")
with_interactions <- with_interactions[,-(1:500)]
gc()  # garbage collector

vars_partialling_out9 <- with_interactions[,1:500]  %>% as.matrix()
dim(vars_partialling_out9)
vars_partialling_out9 <- residuals(felm(vars_partialling_out9 ~ -1  | search_tag + kanton + rank, data=fixed_effects,nostats=TRUE))
colnames(vars_partialling_out9) <- str_replace_all(colnames(vars_partialling_out9),"vars_partialling_out9.","")
with_interactions <- with_interactions[,-(1:500)]
gc()  # garbage collector

vars_partialling_out10 <- with_interactions[,1:500]  %>% as.matrix()
dim(vars_partialling_out10)
vars_partialling_out10 <- residuals(felm(vars_partialling_out10 ~ -1  | search_tag + kanton + rank, data=fixed_effects,nostats=TRUE))
colnames(vars_partialling_out10) <- str_replace_all(colnames(vars_partialling_out10),"vars_partialling_out10.","")
with_interactions <- with_interactions[,-(1:500)]
gc()  # garbage collector

vars_partialling_out11 <- with_interactions[,1:500]  %>% as.matrix()
dim(vars_partialling_out11)
vars_partialling_out11 <- residuals(felm(vars_partialling_out11 ~ -1  | search_tag + kanton + rank, data=fixed_effects,nostats=TRUE))
colnames(vars_partialling_out11) <- str_replace_all(colnames(vars_partialling_out11),"vars_partialling_out11.","")
with_interactions <- with_interactions[,-(1:500)]
gc()  # garbage collector

vars_partialling_out12 <- with_interactions[,1:500]  %>% as.matrix()
dim(vars_partialling_out12)
vars_partialling_out12 <- residuals(felm(vars_partialling_out12 ~ -1  | search_tag + kanton + rank, data=fixed_effects,nostats=TRUE))
colnames(vars_partialling_out12) <- str_replace_all(colnames(vars_partialling_out12),"vars_partialling_out12.","")
with_interactions <- with_interactions[,-(1:500)]
gc()  # garbage collector

vars_partialling_out13 <- with_interactions  %>% as.matrix()
dim(vars_partialling_out13)
vars_partialling_out13 <- residuals(felm(vars_partialling_out13 ~ -1  | search_tag + kanton + rank, data=fixed_effects,nostats=TRUE))
colnames(vars_partialling_out13) <- str_replace_all(colnames(vars_partialling_out13),"vars_partialling_out13.","")
rm(with_interactions)
gc()  # garbage collector


# Bind all elements together (in three steps to save memory):
partialled_out <- cbind(vars_partialling_out1,vars_partialling_out2,vars_partialling_out3,vars_partialling_out4,vars_partialling_out5)
rm(vars_partialling_out1,vars_partialling_out2,vars_partialling_out3,vars_partialling_out4,vars_partialling_out5)
gc()  # garbage collector

partialled_out <- cbind(partialled_out,vars_partialling_out6,vars_partialling_out7,vars_partialling_out8,vars_partialling_out9)
rm(vars_partialling_out6,vars_partialling_out7,vars_partialling_out8,vars_partialling_out9)
gc()  # garbage collector

partialled_out <- cbind(partialled_out,vars_partialling_out10,vars_partialling_out11,vars_partialling_out12,vars_partialling_out13)
rm(vars_partialling_out10,vars_partialling_out11,vars_partialling_out12,vars_partialling_out13)
gc()  # garbage collector



######################################################################
######################################################################
# Lasso selecion
######################################################################
######################################################################

head(partialled_out[,1:30])

# We store all workvolume variables in a separate matrix:
workvolume <- partialled_out[,(startsWith(colnames(partialled_out),"workvolume_cat"))] 
head(workvolume)

# We drop all workvolume variables from "partialled_out":
partialled_out <- partialled_out[,!(startsWith(colnames(partialled_out),"workvolume_cat"))]
head(partialled_out[,1:20])
gc()  # garbage collector

# We store the gender variable in a separate vector:
gender <- partialled_out[,2]
gender <- as.matrix(gender)

# Drop gender from partialled_out:
partialled_out <- partialled_out[,-2]
gc()  # garbage collector

# We store the dependent var contact_button_clicked in a separate matrix:
contact_button <- partialled_out[,1]
contact_button <- as.matrix(contact_button)

# Drop contact_button_clicked from partialled_out:
partialled_out <- partialled_out[,-1]
gc()  # garbage collector


######################################################################
# Lasso estimation according to Double Lasso Var selection
######################################################################

## Convert X to a big.matrix object
# Advantage of big.matrix is that it does not use a lot of memory. However, it writes the matrix on the C-drive. 
# Hence, we need sufficient (temporary) storage space on the C drive (at least 140 GB)!

partialled_out <- as.big.matrix(as.data.table(partialled_out))
dim(partialled_out)
gc()


#####################################################################
# STEP 1: select variables that predict outcome
#####################################################################

n=nrow(partialled_out)
p=ncol(partialled_out)
sda = sd(contact_button)
lambda2 = sda*(1.1/sqrt(n))* qnorm(1 - (.1/log(n))/(2*p)) # We use lambda/2N as regularization parameter as in Belloni et al. (2014: 615). Instead of (2.2*sqrt(n))/2n we write 1.1/sqrt(n)
summary(lambda2)
ylambdas <- rep(0, 9)
ysdas <- rep(0, 9)
k = 1
while(k < 5){
  message(sprintf("Now at k = %d", k))
  fitY = biglasso(partialled_out, contact_button, lambda=lambda2)
  ba = coef(fitY, s = lambda2)
  ea = contact_button-predict(fitY,partialled_out)
  sda = sd(ea)
  lambda2 = sda*(1.1/sqrt(n))* qnorm(1 - (.1/log(n))/(2*p))  # Here we use lambda/2N as regularization parameter.  Instead of (2.2*sqrt(n))/2n we write 1.1/sqrt(n)
  # SELECT ONE LAMBDA!
  ylambdas[k] <- lambda2
  ysdas[k] <- sda
  if(k > 1) { 
    if ((ylambdas[k] - ylambdas[k-1])/ylambdas[k-1] < 1/1000000) {
      break
    }
  }
  k = k+1
}
ba_df <- data.table(
  var_ysel = rownames(ba),
  coeff_ysel = ba[,1],
  lambda = ylambdas[k-1] 
)
ba_df <- ba_df[abs(coeff_ysel) > 0]



#####################################################################
# STEP 2: select variables that predict gender
#####################################################################


n=nrow(partialled_out)
p=ncol(partialled_out)
sda = sd(gender)
lambda2 = sda*(1.1/sqrt(n))* qnorm(1 - (.1/log(n))/(2*p)) # We use lambda/2N as regularization parameter.  Instead of (2.2*sqrt(n))/2n we write 1.1/sqrt(n) 
summary(lambda2)
lambdas <- rep(0, 9)
sdas <- rep(0, 9)
k = 1
while(k < 5){
  message(sprintf("Now at k = %d", k))
  localfit = biglasso(partialled_out, gender, lambda=lambda2)  
  ba = coef(localfit, s = lambda2)
  ea = gender-predict(localfit,partialled_out)
  sda = sd(ea)
  lambda2 = sda*(1.1/sqrt(n))* qnorm(1 - (.1/log(n))/(2*p)) # We use lambda/2N as regularization parameter.  Instead of (2.2*sqrt(n))/2n we write 1.1/sqrt(n)
  lambdas[k] <- lambda2
  sdas[k] <- sda
  if(k > 1) { 
    if ((lambdas[k] - lambdas[k-1])/lambdas[k-1] < 1/1000000) {
      break
    }
  }
  k = k+1
}   
ba_df_geschl <- data.table(
  var_geschsel = rownames(ba),
  coeff_geschsel = ba[,1],
  lambda = lambdas[k-1] 
)
ba_df_geschl <- ba_df_geschl[abs(coeff_geschsel) > 0]

#####################################################################
# STEP 2: select variables that predict Workvolume
#####################################################################

workvolume_cat0 <- as.matrix(workvolume[,1])
workvolume_cat1 <- as.matrix(workvolume[,2])
workvolume_cat2 <- as.matrix(workvolume[,3])
workvolume_cat3 <- as.matrix(workvolume[,4])

n=nrow(partialled_out)
p=ncol(partialled_out)
fit_lambdas <- list()
fit_sds <- list()
for(i in c(0:3)) {
  var_volume <- paste0("workvolume_cat", i)
  var_fit <- paste0("fitT", i)
  sd1=sd(get(var_volume))
  lambda2 = sd1*(1.1/sqrt(n))* qnorm(1 - (.1/log(n))/(2*p)) # We use lambda/2N as regularization parameter.  Instead of (2.2*sqrt(n))/2n we write 1.1/sqrt(n) 
  summary(lambda2)
  lambdas <- rep(0, 10)
  sdas <- rep(0, 10)
  k = 1
  while(k < 5){
    message(sprintf("Now at k = %d", k))
    localfit = biglasso(partialled_out, get(var_volume), lambda=lambda2)  
    ba = coef(localfit, s = lambda2)
    ea = get(var_volume)-predict(localfit,partialled_out)
    sda = sd(ea)
    lambda2 = sda*(1.1/sqrt(n))* qnorm(1 - (.1/log(n))/(2*p)) # We use lambda/2N as regularization parameter.  Instead of (2.2*sqrt(n))/2n we write 1.1/sqrt(n)
    lambdas[k] <- lambda2
    sdas[k] <- sda
    if(k > 1) { 
      if ((lambdas[k] - lambdas[k-1])/lambdas[k-1] < 1/1000000) {
        break
      }
    }
    k = k+1
  }  
  ba_df_y <- data.table(
    var_volumesel = rownames(ba),
    coeff_volumesel = ba[,1],
    lambda = lambdas[k-1] 
  )
  ba_df_y <- ba_df_y[abs(coeff_volumesel) > 0]
  ba_df_yname <- paste0("ba_df_", var_volume)
  assign(ba_df_yname,ba_df_y)
  fit_lambdas[[var_volume]] <- lambdas
  fit_sds[[var_volume]] <- sdas
  assign(var_fit, localfit)
}

ba_df_varnames <- unlist(ba_df[,"var_ysel"], use.names=FALSE)
ba_df_ba_df_workvolume_cat0_names <- unlist(ba_df_workvolume_cat0[,"var_volumesel"], use.names=FALSE)
ba_df_ba_df_workvolume_cat1_names <- unlist(ba_df_workvolume_cat1[,"var_volumesel"], use.names=FALSE)
ba_df_ba_df_workvolume_cat2_names <- unlist(ba_df_workvolume_cat2[,"var_volumesel"], use.names=FALSE)
ba_df_ba_df_workvolume_cat3_names <- unlist(ba_df_workvolume_cat3[,"var_volumesel"], use.names=FALSE)
ba_df_geschlecht_varnames <- unlist(ba_df_geschl[,"var_geschsel"], use.names=FALSE)

use_new = union(ba_df_varnames,ba_df_ba_df_workvolume_cat0_names)
use_new = union(use_new,ba_df_ba_df_workvolume_cat1_names)
use_new = union(use_new,ba_df_ba_df_workvolume_cat2_names)
use_new = union(use_new,ba_df_ba_df_workvolume_cat3_names)
use_new = union(use_new,ba_df_geschlecht_varnames)

use_new <- use_new[2:(length(use_new))] # without intercept

write.csv2(use_new, file = "Misc_files/union_varnames_biglasso_workvolume_det_new.csv")

# Save the union of lasso selected variables that predict gender and the outcome
geschlecht_outcome <- union(ba_df_varnames,ba_df_geschlecht_varnames)
geschlecht_outcome <- geschlecht_outcome[2:length(geschlecht_outcome)] # without intercept
write.csv2(geschlecht_outcome, file = "Misc_files/varnames_biglasso_workvolume_geschlecht.csv")
